[FLINK]Support processing time tumble window for nexmark q12#13
[FLINK]Support processing time tumble window for nexmark q12#13KevinyhZou wants to merge 19 commits intobigo-sg:gluten-0530from
Conversation
| // Look for the end of the last group. | ||
| vector_size_t index = 0; | ||
| if (prevInput_) { | ||
| if (prevInput_ && numGroups_ > 0) { |
| RowVectorPtr output; | ||
|
|
||
| if (numGroups_ > minOutputBatchSize_) { | ||
| if (numGroups_ >= minOutputBatchSize_) { |
There was a problem hiding this comment.
对,velox 里面minOutputBatchSize 最小只能取1, numGroups > minOutputBatchSize 意味着当只有一条数据的时候,是不输出的,这不符合流计算的场景。
| } | ||
| } | ||
|
|
||
| long TimeWindowUtil::getCurrentProcessingTime() { |
|
|
||
| static long cleanupTime(long maxTimestamp, long allowedLateness_, bool isEventTime); | ||
|
|
||
| static long getCurrentProcessingTime(); |
There was a problem hiding this comment.
static int64_t getCurrentProcessingTime(); since sizeof(long) may 4 bytes on 32 bit machine.
| now.time_since_epoch()).count(); | ||
| return {{timestamp_ms, input}}; | ||
| long timestamp_ms = TimeWindowUtil::getCurrentProcessingTime(); | ||
| if (windowType_ == 1) { // tumble window |
| ProcessingTimeCallback callback_; | ||
| }; | ||
|
|
||
| class ProcessingTimeSerivice { |
| return TimeWindowUtil::getCurrentProcessingTime(); | ||
| } | ||
| virtual std::optional<std::string> registerTimer(long timestamp, ProcessingTimerTask target) { | ||
| std::optional<std::string> task; |
There was a problem hiding this comment.
用纯虚函数,virtual std::optional<std::string> registerTimer(long timestamp, ProcessingTimerTask target) = 0;
| virtual void cancel(const std::string& task) {} | ||
| virtual void close() {} | ||
|
|
||
| void finish(const std::string& task) { |
| return "proc_time_task_" + std::to_string(timestamp); | ||
| } | ||
| protected: | ||
| std::vector<std::string> registry; |
There was a problem hiding this comment.
用std::set是不是更好? vector不适合频繁查找和删除。
|
|
||
| namespace facebook::velox::stateful { | ||
|
|
||
| using ProcessingTimeCallback = std::function<void(long)>; |
There was a problem hiding this comment.
long统一都改成int64_t吧 避免32位系统的溢出
| } | ||
|
|
||
| std::string generateTimerTaskName(long timestamp) { | ||
| return "proc_time_task_" + std::to_string(timestamp); |
There was a problem hiding this comment.
如果在统一ms注册 这个可能冲突?加个counter
static std::atomic<uint64_t> counter{0};
return "proc_time_task_" + std::to_string(timestamp) + "_" + std::to_string(counter++);
| void close() override { | ||
| if (executor_) { | ||
| executor_->shutdown(); | ||
| } |
|
|
||
| /// This class is relevent to flink KeySelector. | ||
| /// It can partition the RowVector according to the key fields. | ||
| class KeySelector { |
There was a problem hiding this comment.
KeySelector 应改是KeyPartitioner?
| @@ -31,7 +31,7 @@ | |||
| memory::MemoryPool* pool, | |||
| int numPartitions = INT_MAX); | |||
| } | ||
|
|
||
| std::map<uint32_t, RowVectorPtr> KeySelector::partition(const RowVectorPtr& input) { | ||
| std::map<uint64_t, RowVectorPtr> KeySelector::partition(const RowVectorPtr& input) { |
There was a problem hiding this comment.
这个接口有点重,不应该用map 不过跟你这次改动没关系了 后面再改吧
| } | ||
| } | ||
|
|
||
| void KeySelector::allocateIndexBuffers( |
There was a problem hiding this comment.
这个接口实现重,这样更好
struct IndexBuffers {
std::vector<BufferPtr> buffers;
std::vector<vector_size_t*> rawPtrs;
};
IndexBuffers allocateIndexBuffers(const std::vector<vector_size_t>& counts);
| #include "velox/experimental/stateful/Triggerable.h" | ||
| #include "velox/experimental/stateful/window/SliceAssigner.h" | ||
| #include "velox/experimental/stateful/window/WindowBuffer.h" | ||
| #include <memory> |
| StatefulOperator::close(); | ||
| localAggerator_->close(); | ||
| if (localAggerator_) { | ||
| localAggerator_->close(); |
| #include "velox/experimental/stateful/window/TimeWindowUtil.h" | ||
|
|
||
| #include <list> | ||
| #include <memory> |
| StatefulOperator::initialize(); | ||
| localAggerator_->initialize(); | ||
| if (localAggerator_) { | ||
| localAggerator_->initialize(); |
| const int windowStartIndex, | ||
| const int windowEndIndex) | ||
| : StatefulOperator(std::move(globalAggerator), std::move(targets)), | ||
| localAggerator_(std::move(localAggerator)), |
| @@ -149,7 +272,9 @@ long WindowAggregator::sliceStateMergeTarget(long sliceToMerge) { | |||
| void WindowAggregator::close() { | |||
| processWatermarkInternal(INT_MAX); | |||
There was a problem hiding this comment.
processWatermarkInternal(std::numeric_limits<int64_t>::max());
| const RowVectorPtr& output, | ||
| const std::string& fieldName, | ||
| const TypePtr& fieldType, | ||
| const long fieldValue, |
| RowVectorPtr addWindowTimestampToOutput( | ||
| const RowVectorPtr& output, | ||
| const std::string& fieldName, | ||
| const TypePtr& fieldType, |
| std::list<RowVectorPtr> allDatas; | ||
| for (const auto& data: datas) { | ||
| allDatas.push_back(data); | ||
| } |
There was a problem hiding this comment.
std::vector<RowVectorPtr> allDatas(datas.begin(), datas.end());
|
|
||
| template<typename K> | ||
| void WindowAggregator::fireWindow(K key, long timerTimestamp, long windowEnd) { | ||
| RowVectorPtr output = windowState_->value(key, windowEnd); |
There was a problem hiding this comment.
if (!output) {
LOG(INFO) << "No output found for key: " << key << ", window end: " << windowEnd;
return;
}
| windowState_->remove(key, windowEnd); | ||
| } | ||
|
|
||
| void WindowAggregator::onEventTime(std::shared_ptr<TimerHeapInternalTimer<uint32_t, long>> timer) { |
| onTimer(timer); | ||
| } | ||
|
|
||
| void WindowAggregator::onProcessingTime(std::shared_ptr<TimerHeapInternalTimer<uint32_t, long>> timer) { |
|
|
||
| template<typename K> | ||
| void WindowAggregator::clearWindow(K key, long timerTimestamp, long windowEnd) { | ||
| windowState_->remove(key, windowEnd); |
| size_ = 0; | ||
| } | ||
|
|
||
| bool empty() override { |
There was a problem hiding this comment.
bool empty() const override
|
|
||
| namespace facebook::velox::stateful { | ||
|
|
||
| static int roundUpToPowerOfTwo(int32_t x) { |
There was a problem hiding this comment.
参照ck的实现
template <typename T>
requires std::is_integral_v<T> && (sizeof(T) == sizeof(UInt64))
inline T roundDownToPowerOfTwo(T x)
{
return x <= 0 ? 0 : (T(1) << (63 - __builtin_clzll(x)));
}
| int numPartitions = 1024); | ||
|
|
||
| std::map<uint32_t, RowVectorPtr> partition(const RowVectorPtr& input); | ||
| std::map<int64_t, RowVectorPtr> partition(const RowVectorPtr& input); |
There was a problem hiding this comment.
key使用一个类型别名吧,估计后面需要换类型,64位hash值还是有可能会出现冲突。map换 unordered_map
StreamingAggreagate